import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from termcolor import colored
# Widen column display so the long article-text column is readable in output.
pd.options.display.max_colwidth = 200
# TF-IDF matrix of news articles (one row per article) plus metadata columns.
# NOTE(review): machine-specific absolute path — consider a relative path or config.
newsio = pd.read_csv("/Users/LJL/Documents/\
GU 2021 Fall/Anly 501/Data Cleaning/newsio_tfidf.csv", index_col=0)
# Drop the "hong"/"kong" term columns — presumably the corpus was collected by
# searching "hong kong", so these terms appear everywhere and carry no signal
# (TODO confirm against the data-collection step).
newsio = newsio.drop(["hong", "kong"], axis=1)
# Encode the boolean `label` column as a numeric 0/1 copy in `data_label`.
newsio.loc[(newsio.label == False),'data_label'] = 0
newsio.loc[(newsio.label == True),'data_label'] = 1
newsio["publish_time"] = pd.to_datetime(newsio["publish_time"])
print(len(newsio))
newsio.head(1)
6496
| source_name | publish_time | text_info | data_label | aa | aapl | aaron | ab | abandon | abate | ... | zu | zurich | zweig | zählende | área | ásia | émettrice | آســــــيا | أصلي | ود | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Bloomberg | 2019-01-08 | bitcoin mining chip maker canaan considers u.s. ipo canaan inc., china’s second-biggest maker of bitcoin mining hardware, is considering listing in the u.s. after shelving plans for a hong kong i... | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1 rows × 10798 columns
# Keep the boolean label as a one-column frame; everything except the
# metadata columns (source/time/text) and the numeric label copy is treated
# as a feature for clustering.
# NOTE(review): the boolean "label" column itself is NOT dropped here, so it
# remains among the clustering features — confirm that is intended.
labels = newsio[["label"]]
features = newsio.drop(
    columns=["source_name", "publish_time", "text_info", "data_label"])
features.head(3)
| aa | aapl | aaron | ab | abandon | abate | abatin | abbisko | abct | abdo | ... | zu | zurich | zweig | zählende | área | ásia | émettrice | آســــــيا | أصلي | ود | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 rows × 10794 columns
from sklearn.decomposition import TruncatedSVD
# Reduce the ~10.8k-term TF-IDF matrix to 300 latent components (LSA).
# FIX: TruncatedSVD's randomized solver is stochastic — pin random_state so
# the embedding (and every clustering result downstream) is reproducible.
# Seed 10 matches the seed already used for KMeans later in this file.
trunc = TruncatedSVD(n_components=300, random_state=10)
features_trunc = trunc.fit_transform(features)
def show_key_words(features_array, labels):
    """Summarize each cluster label by article count and top-10 weighted words.

    Parameters
    ----------
    features_array : DataFrame or 2-D array-like
        Feature matrix (e.g. TF-IDF weights), one row per article; column
        labels are used as the "word" names in the result.
    labels : sequence
        Cluster/label assignment for each row of ``features_array``.

    Returns
    -------
    DataFrame indexed by label, with columns "News Count" (articles in the
    cluster) and "Top 10 Words" (column names with the largest summed weight,
    descending).
    """
    features = pd.DataFrame(features_array)
    features["label"] = labels
    key_words = []
    # Per-label column totals and per-label row counts.
    features_label_sum = features.groupby(["label"]).sum().reset_index()
    features_label_count = features.groupby(["label"]).count()
    for label in sorted(set(labels)):
        # FIX: removed dead code — the original also sliced the full frame by
        # label here (`features_article_label`) on every iteration but never
        # used the result.
        features_key = features_label_sum[features_label_sum["label"] == label]
        features_key = features_key.drop("label", axis=1).T
        # Any column of the count frame holds the row count for this label.
        news_count = features_label_count.loc[label, features_label_count.columns[0]]
        # Largest summed weight first; the index carries the word names.
        features_key = features_key.sort_values(features_key.columns[0], ascending=False)
        top10 = list(features_key.index[:10])
        key_words.append([label, news_count, top10])
    key_words_df = pd.DataFrame(key_words, columns=["Label", "News Count", "Top 10 Words"])
    return key_words_df.set_index("Label")
from sklearn.cluster import DBSCAN
# Density-based clustering on the SVD-reduced features (eps left at default;
# label -1 marks noise points). fit_predict == fit(...).labels_.
db_model = DBSCAN(min_samples=10)
db_labels = db_model.fit_predict(features_trunc)
print(len(set(db_labels)))
key_words = show_key_words(features, db_labels)
key_words
18
| News Count | Top 10 Words | |
|---|---|---|
| Label | ||
| -1 | 3832 | [numbr, china, say, chinese, stock, law, security, share, protest, city] |
| 0 | 2356 | [numbr, china, announce, company, research, new, release, globe, newswire, group] |
| 1 | 24 | [news, international, reuters, multimedia, profession, thomson, division, trust, reach, provide] |
| 2 | 31 | [maoyan, entertainment, empower, innovative, internet, provide, platform, service, lead, company] |
| 3 | 36 | [futu, numbr, brokerage, holding, platform, limit, online, drive, nasdaq, wealth] |
| 4 | 20 | [hazeltree, treasury, integrate, finance, management, portfolio, solution, york, provider, london] |
| 5 | 15 | [primeline, energy, wire, dissemination, holdings, distribution, news, service, states, united] |
| 6 | 19 | [iclick, interactive, iclk, asia, enterprise, limited, marketing, numbr, group, nasdaq] |
| 7 | 15 | [data, center, numbr, market, analysis, investment, report, opportunity, cagr, growth] |
| 8 | 19 | [solar, sky, holdings, skys, owner, announce, developer, park, operator, nasdaq] |
| 9 | 35 | [netdragon, websoft, code, community, build, internet, holdings, leader, numbr, global] |
| 10 | 13 | [spi, energy, santa, clara, calif, solution, provider, nasdaq, green, numbr] |
| 11 | 13 | [clps, incorporation, announce, nasdaq, company, today, prnewswire, numbr, subsidiary, agreement] |
| 12 | 13 | [reyna, silver, newswire, services, dissemination, corp, distribution, toronto, numbr, states] |
| 13 | 15 | [chi, med, meditech, hutchison, florham, hcm, park, aim, limited, numbr] |
| 14 | 10 | [numbr, raise, ipo, source, price, billion, knowledge, matter, hk, initial] |
| 15 | 14 | [algorithm, viyi, research, spac, venus, tickerwin, numbr, venu, merge, recently] |
| 16 | 16 | [li, jd, xpeng, nasdaq, auto, alibaba, nyse, tencent, limit, baidu] |
import scipy
from scipy.cluster import hierarchy
plt.figure(figsize=(8, 6), dpi=80)
# Ward-linkage dendrogram over the SVD-reduced articles, used to eyeball a
# reasonable cluster count for the agglomerative model below.
dendro=hierarchy.dendrogram(hierarchy.linkage(features_trunc,method='ward'))
plt.axhline(6, color='r')# cut at 6 to get 11 clusters
<matplotlib.lines.Line2D at 0x151dcd1c0>
from sklearn.cluster import AgglomerativeClustering
# 11 clusters, as suggested by cutting the dendrogram at height 6.
# FIX: dropped `affinity='euclidean'` — the parameter was deprecated (renamed
# `metric`) and removed in scikit-learn 1.4, and ward linkage requires
# euclidean distance anyway, so the result is identical on every version.
agg = AgglomerativeClustering(n_clusters=11, linkage='ward')
hierarchy_labels = agg.fit_predict(features_trunc)
print(len(set(hierarchy_labels)))
key_words = show_key_words(features, hierarchy_labels)
key_words
11
| News Count | Top 10 Words | |
|---|---|---|
| Label | ||
| 0 | 2571 | [numbr, announce, china, company, new, coronavirus, year, exchange, newswire, group] |
| 1 | 661 | [stock, china, trump, share, trade, sanction, market, investor, fall, tension] |
| 2 | 1802 | [china, say, law, security, protest, national, city, democracy, beijing, activist] |
| 3 | 171 | [research, wimi, ar, hologram, recently, mobiustrend, numbr, release, organization, newswire] |
| 4 | 546 | [numbr, billion, ipo, alibaba, list, raise, group, share, jd, ant] |
| 5 | 118 | [distribution, indirectly, canada, australia, directly, release, publication, japan, states, united] |
| 6 | 486 | [police, protester, protest, tear, gas, anti, government, clash, violence, university] |
| 7 | 39 | [ucloudlink, ucl, sharing, traffic, mobile, marketplace, group, datum, numbr, nasdaq] |
| 8 | 36 | [futu, numbr, brokerage, holding, platform, limit, online, drive, nasdaq, wealth] |
| 9 | 35 | [netdragon, websoft, code, community, build, internet, holdings, leader, numbr, global] |
| 10 | 31 | [maoyan, entertainment, empower, innovative, internet, provide, platform, service, lead, company] |
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import silhouette_samples
import matplotlib
# Silhouette analysis: score KMeans for k = 2..49 to choose the cluster count.
range_n_clusters = list(range(2, 50))
scores = []
cluster_nums = []
for n_clusters in range_n_clusters:
    clusterer = KMeans(n_clusters=n_clusters, random_state=10)
    cluster_labels = clusterer.fit_predict(features_trunc)
    silhouette_avg = silhouette_score(features_trunc, cluster_labels)
    cluster_nums.append(n_clusters)
    scores.append(silhouette_avg)
plt.plot(cluster_nums, scores)
plt.show()
# Re-plot with candidate cluster counts marked.
plt.plot(cluster_nums, scores)
plt.axvline(x=28, color='red', linestyle='--')
plt.axvline(x=9, color='red', linestyle='--')
plt.axvline(x=3, color='red', linestyle='--')
plt.show()
# BUG FIX: the best k was previously read from a hard-coded index
# (`cluster_nums[37]`) instead of the argmax itself — the recorded output
# shows argmax 38 (k=40) while 39 was printed. Derive the index once.
best_idx = int(np.argmax(scores))
print(best_idx)
print(cluster_nums[best_idx])
38 39
# num_clus = [x for x in range(5,40)]
# squared_errors = []
# for cluster in num_clus:
# kmeans = KMeans(n_clusters = cluster).fit(features_trunc) # Train Cluster
# squared_errors.append(kmeans.inertia_) # Appending the squared loss obtained in the list
# optimal_clusters = np.argmin(squared_errors) + 2 # As argmin return the index of minimum loss.
# plt.plot(num_clus, squared_errors)
# plt.title("Elbow Curve to find the no. of clusters.")
# plt.xlabel("Number of clusters.")
# plt.ylabel("Squared Loss.")
# xy = (optimal_clusters, min(squared_errors))
# plt.annotate('(%s, %s)' % xy, xy = xy, textcoords='data')
# plt.show()
# print ("The optimal number of clusters obtained is - ", optimal_clusters)
# print ("The loss for optimal cluster is - ", min(squared_errors))
import plotly.express as px
# KMeans with 4 clusters (one of the silhouette-suggested counts).
kmeans = KMeans(n_clusters=4).fit(features_trunc)
labels_4 = kmeans.labels_
key_words = show_key_words(features, labels_4)
newsio["cluster"] = labels_4
# Daily article count per cluster.
news_by_label = newsio.groupby(["publish_time", "cluster"])\
    .agg({"text_info": "count"}).reset_index()
# FIX: removed the re-group + nlargest(1) over the same keys — after the
# groupby above, each (publish_time, cluster) pair is already unique, so
# that step returned its input unchanged while doing O(groups) extra work.
news_by_label_top1 = news_by_label
# Wide form: one column per cluster, one row per day.
plot_df = news_by_label_top1.pivot(index="publish_time", columns="cluster",
                                   values="text_info")
fig = px.line(plot_df)
fig.update_xaxes(
    dtick="M1",
    tickformat="%b\n%Y")
fig.show()
fig.write_html("4_clusters.html")
key_words
| News Count | Top 10 Words | |
|---|---|---|
| Label | ||
| 0 | 2096 | [numbr, billion, company, announce, group, share, list, newswire, globe, raise] |
| 1 | 3583 | [china, protest, say, protester, police, city, chinese, numbr, government, stock] |
| 2 | 656 | [security, law, national, china, new, say, impose, beijing, legislation, city] |
| 3 | 161 | [research, ar, wimi, hologram, recently, mobiustrend, release, organization, numbr, newswire] |
# KMeans with 9 clusters (second silhouette candidate).
kmeans = KMeans(n_clusters=9).fit(features_trunc)
labels_9 = kmeans.labels_
key_words = show_key_words(features, labels_9)
newsio["cluster"] = labels_9
# Daily article count per cluster.
news_by_label = newsio.groupby(["publish_time", "cluster"])\
    .agg({"text_info": "count"}).reset_index()
# FIX: removed the re-group + nlargest(1) over the same keys — each
# (publish_time, cluster) pair is already unique after the groupby above,
# so that step was a no-op.
news_by_label_top1 = news_by_label
plot_df = news_by_label_top1.pivot(index="publish_time", columns="cluster",
                                   values="text_info")
fig = px.line(plot_df)
fig.update_xaxes(
    dtick="M1",
    tickformat="%b\n%Y")
fig.show()
key_words
| News Count | Top 10 Words | |
|---|---|---|
| Label | ||
| 0 | 180 | [trump, president, donald, china, trade, response, say, deal, law, sanction] |
| 1 | 575 | [stock, share, numbr, market, china, trade, investor, fall, gain, rise] |
| 2 | 493 | [numbr, billion, raise, ipo, list, alibaba, ant, group, share, source] |
| 3 | 2530 | [china, numbr, say, coronavirus, chinese, protest, city, new, year, government] |
| 4 | 542 | [security, law, national, china, new, say, impose, beijing, legislation, city] |
| 5 | 208 | [research, ar, wimi, hologram, numbr, maoyan, recently, release, mobiustrend, organization] |
| 6 | 880 | [police, protester, protest, government, democracy, leader, city, pro, anti, lam] |
| 7 | 118 | [distribution, indirectly, canada, australia, directly, release, publication, japan, states, united] |
| 8 | 970 | [numbr, announce, company, newswire, globe, today, prnewswire, group, nasdaq, lead] |
# KMeans with 28 clusters (third silhouette candidate).
kmeans = KMeans(n_clusters=28).fit(features_trunc)
labels_28 = kmeans.labels_
key_words = show_key_words(features, labels_28)
newsio["cluster"] = labels_28
# Daily article count per cluster.
news_by_label = newsio.groupby(["publish_time", "cluster"])\
    .agg({"text_info": "count"}).reset_index()
# FIX: removed the re-group + nlargest(1) over the same keys — each
# (publish_time, cluster) pair is already unique after the groupby above,
# so that step was a no-op.
news_by_label_top1 = news_by_label
plot_df = news_by_label_top1.pivot(index="publish_time", columns="cluster",
                                   values="text_info")
fig = px.line(plot_df)
fig.update_xaxes(
    dtick="M1",
    tickformat="%b\n%Y")
fig.show()
fig.write_html("28_clusters.html")
key_words
| News Count | Top 10 Words | |
|---|---|---|
| Label | ||
| 0 | 40 | [nba, tweet, basketball, houston, support, game, rockets, association, league, backlash] |
| 1 | 531 | [stock, share, trade, china, market, investor, fall, numbr, gain, index] |
| 2 | 528 | [security, law, national, china, say, impose, new, legislation, beijing, city] |
| 3 | 2359 | [china, say, protest, chinese, numbr, city, coronavirus, government, leader, new] |
| 4 | 118 | [distribution, indirectly, canada, australia, directly, release, publication, japan, states, united] |
| 5 | 1107 | [numbr, announce, company, prnewswire, newswire, globe, today, group, global, result] |
| 6 | 31 | [maoyan, entertainment, empower, innovative, internet, provide, platform, service, lead, company] |
| 7 | 30 | [news, international, reuters, profession, multimedia, thomson, division, trust, reach, provide] |
| 8 | 27 | [data, center, numbr, market, analysis, report, researchandmarket, com, datum, add] |
| 9 | 55 | [cathay, pacific, airways, numbr, airline, pilot, carrier, billion, hk, swire] |
| 10 | 43 | [exchange, lse, london, stock, billion, takeover, clearing, numbr, bid, bourse] |
| 11 | 519 | [police, protester, protest, tear, gas, government, anti, pro, democracy, clash] |
| 12 | 39 | [ucloudlink, ucl, sharing, traffic, mobile, marketplace, group, datum, numbr, nasdaq] |
| 13 | 72 | [lai, jimmy, tycoon, arrest, medium, media, security, national, law, charge] |
| 14 | 14 | [algorithm, viyi, research, spac, venus, tickerwin, numbr, venu, merge, recently] |
| 15 | 50 | [evergrande, property, developer, china, group, share, default, debt, low, stock] |
| 16 | 35 | [netdragon, websoft, code, community, build, internet, holdings, leader, numbr, global] |
| 17 | 20 | [hazeltree, treasury, integrate, finance, management, portfolio, solution, york, provider, london] |
| 18 | 277 | [numbr, billion, raise, list, source, ipo, alibaba, matter, hk, knowledge] |
| 19 | 36 | [futu, numbr, brokerage, holding, platform, limit, online, drive, nasdaq, wealth] |
| 20 | 38 | [joshua, wong, activist, democracy, prominent, jail, pro, district, election, assembly] |
| 21 | 21 | [tiananmen, vigil, square, candle, crackdown, tiananman, commemorate, ban, democracy, anniversary] |
| 22 | 161 | [research, ar, wimi, hologram, recently, mobiustrend, release, organization, numbr, newswire] |
| 23 | 105 | [jd, debut, numbr, share, com, ipo, health, billion, listing, trading] |
| 24 | 19 | [li, jd, auto, xpeng, nasdaq, alibaba, nyse, tencent, limit, xpev] |
| 25 | 43 | [hutchmed, florham, hcm, chi, numbr, park, med, aim, china, shanghai] |
| 26 | 82 | [ant, group, ipo, shanghai, billion, ma, jack, alibaba, dual, numbr] |
| 27 | 96 | [election, vote, democracy, pro, legislature, candidate, council, leader, opposition, city] |
# Articles that mention "coronavirus" anywhere in their text.
has_covid = newsio["text_info"].str.contains("coronavirus")
print(newsio.loc[has_covid])
source_name publish_time \
2145 Reuters 2020-01-21
2149 Reuters 2020-01-22
2150 Reuters 2020-01-22
2151 Reuters 2020-01-22
2157 Reuters 2020-01-23
... ... ...
5503 Reuters 2021-05-04
5504 Reuters 2021-05-04
5573 Reuters 2021-05-14
5675 CNBC 2021-06-01
5988 Reuters 2021-07-19
text_info \
2145 china virus scare sends shudder through european luxury goods sector european luxury stocks slumped across the board on tuesday on fears that the coronavirus virus outbreak in china could hurt sa...
2149 hong kong on high alert to tackle coronavirus outbreak hong kong's government is on high alert to deal with a new flu-like coronavirus that has killed nine people in mainland china, the city's co...
2150 hong kong confirms first case of new wuhan virus: local media hong kong confirmed its first case of a new flu-like coronavirus on wednesday following an outbreak in the central mainland china cit...
2151 man quarantined after hong kong's first preliminary positive test for wuhan virus hong kong quarantined a 39-year-old man on wednesday after the city's first preliminary positive result in a test...
2157 lam says hong kong has system in place to handle coronavirus hong kong leader carrie lam told global elites in davos on thursday she is "cautiously confident" the city will get through the flu-li...
... ...
5503 energy shares lift hong kong stocks as oil firms on easing curbs in u.s., europe hong kong stocks rose on tuesday, with energy shares leading gains, buoyed by rising oil prices as easing coronavi...
5504 energy leads hong kong stocks higher on pandemic recovery signs hong kong shares settled higher on tuesday, with energy stocks leading the gains on signs of recovery from the coronavirus pandemic...
5573 singapore coronavirus cases could burst hopes for hong kong travel bubble a travel bubble between hong kong and singapore set to open on may 26 has a "high chance" of being postponed, a hong kong...
5675 zero covid? taiwan outbreak shows that's not a long-term solution, says professor taiwan is "completely susceptible" to new coronavirus variants that are more transmissible, said benjamin cowling...
5988 hong kong shares fall as regulatory clampdown hits tech firms hong kong's benchmark hang seng index fell on monday as fresh investor concerns over a regulatory clampdown hobbled shares of china's...
data_label aa aapl aaron ab abandon abate ... zu zurich \
2145 0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0
2149 0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0
2150 0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0
2151 0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0
2157 0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0
... ... ... ... ... ... ... ... ... ... ...
5503 0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0
5504 0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0
5573 0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0
5675 0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0
5988 0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0
zweig zählende área ásia émettrice آســــــيا أصلي ود
2145 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2149 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2150 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2151 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2157 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ...
5503 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
5504 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
5573 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
5675 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
5988 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
[387 rows x 10798 columns]
def get_cmap(n, name='hsv'):
    '''Returns a function that maps each index in 0, 1, ..., n-1 to a distinct
    RGB color; the keyword argument name must be a standard mpl colormap name.'''
    # FIX: plt.cm.get_cmap was deprecated in matplotlib 3.7 and removed in
    # 3.9; use the colormap registry when available, falling back on older
    # versions. Both branches return an n-entry resampled colormap callable.
    try:
        return matplotlib.colormaps[name].resampled(n)
    except AttributeError:
        return plt.cm.get_cmap(name, n)
from sklearn.decomposition import PCA
My_pca = PCA(n_components=2) ## I want the two prin columns
## Transpose it
# NOTE(review): PCA is fit on the TRANSPOSED matrix (300 x 6496), i.e. the
# 6496 articles are treated as "features"; My_pca.components_.T (6496 x 2)
# is then used as per-article 2D coordinates. This is unorthodox — confirm
# it is intended rather than fitting on features_trunc and plotting the
# transform directly.
smalldata_normalized=np.transpose(features_trunc)
My_pca.fit(smalldata_normalized)
print(My_pca)
print(My_pca.components_.T)
# Color/annotate points by the 28-cluster KMeans assignment from above.
KnownLabels = labels_28
# Reformat and view results
Comps = pd.DataFrame(My_pca.components_.T,columns=['PC%s' % _ for _ in range(2)])
print(Comps)
print(Comps.iloc[:,0])
########################
## Look at 2D PCA clusters
############################################
plt.figure(figsize=(9,9))
plt.scatter(Comps.iloc[:,0], Comps.iloc[:,1], s=100, color="green")
plt.xlabel("PC 1")
plt.ylabel("PC 2")
plt.title("Scatter Plot Clusters PC 1 and 2",fontsize=15)
# One distinct color per cluster; annotate every point with its label number.
cmap = get_cmap(len(set(KnownLabels)))
for i, label in enumerate(KnownLabels):
    plt.annotate(label, (Comps.iloc[i,0], Comps.iloc[i,1]), color=cmap(label))
plt.show()
PCA(n_components=2)
[[ 0.00860755 -0.00081099]
[ 0.00830145 0.00051715]
[ 0.01358563 0.0008009 ]
...
[ 0.01242371 -0.00873312]
[ 0.02474617 -0.01984222]
[ 0.01226681 0.00662608]]
PC0 PC1
0 0.008608 -0.000811
1 0.008301 0.000517
2 0.013586 0.000801
3 0.006971 -0.001154
4 0.006479 0.001560
... ... ...
6491 0.011184 -0.012330
6492 0.012800 0.002317
6493 0.012424 -0.008733
6494 0.024746 -0.019842
6495 0.012267 0.006626
[6496 rows x 2 columns]
0 0.008608
1 0.008301
2 0.013586
3 0.006971
4 0.006479
...
6491 0.011184
6492 0.012800
6493 0.012424
6494 0.024746
6495 0.012267
Name: PC0, Length: 6496, dtype: float64